Hierarchical clustering using cutreeDynamic to automatically cut the tree
cluster<-genecluster(ratio_psedo,nct,method="hierarchical")
table(cluster)
## cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 412 319 283 245 201 190 185 183 130 123 116 115 112 107 100 85 80 66
## 19 20 21 22
## 45 35 35 25
Using mclust to do the clustering
cluster<-genecluster(ratio_psedo,nct,G=c(4,8,12,16,20,24))
## model-based optimal number of clusters: 16
table(cluster)
## cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 541 357 295 952 207 194 134 85 24 47 118 87 48 41 29 33
Modelling for each cluster
R=200
start_time <- Sys.time()
f <- ratio ~ p(X, pen = "gflasso")
cl <- makeCluster(ncores) # use 4 workers
registerDoParallel(cl) # register the parallel backend
start_time <- Sys.time()
ans <- pbsapply(1:max(cluster), function(i) { # try on cluster 9th and 10th first
set.seed(i)
print(paste("cluster",i))
feat<-which(cluster==i)
pheatmap(ratio_psedo[feat,], cluster_rows = FALSE, cluster_cols = FALSE,
annotation_col = anno_df,show_colnames = F,show_rownames = F)
# create dataframe for each cluster
cl_ratio<-as.vector(unlist(ratio[feat,]))
cl_total<-as.vector(unlist(total_QCed[feat,]))
data<-tibble(ratio=cl_ratio,X=factor(rep(celltype,each=length(feat)),levels = cell_meta_unique),cts=cl_total)
t <- system.time(fit<-fusedlasso(formula=f,model="binomial",data,ncores=ncores))[[3]] # saving the elapsed time
t2 <- system.time(fit2<-fusedlasso(formula=f,model="gaussian",data,ncores=ncores))[[3]] # saving the elapsed time
## bootstrap 200 replicates
t3<-system.time(boot <- boot(data,statistic=boot_fusedlasso,R=R,strata=data$X,formula=f,model="binomial",lambda1=fit$lambda,parallel="multicore",ncpus=ncores))[[3]]
t4<-system.time(boot2 <- boot(data,statistic=boot_fusedlasso,R=R,strata=data$X,formula=f,model="gaussian",lambda2=fit2$lambda,parallel="multicore",ncpus=ncores))[[3]]
## boxplot of bootstrap allelic ratio estimator
bootbox<-as_tibble(rbind(boot$t,boot2$t))
colnames(bootbox)<-cell_meta_unique
bootbox2<-tibble(bootbox,model=rep(c("bin","gau"),each=R))
bootbox3<-bootbox2 %>% gather(key="type",value = "ratio",zy:lateblast)
ggplot(data = bootbox3, mapping = aes(x=type, y=ratio, fill=model,col=model)) +
geom_boxplot()+ theme_classic()+
geom_jitter(shape=16, position=position_jitter(0.2),size=0.7)+
ylab("allelic ratio")+
xlab("")+scale_x_discrete(name ="Cell types",
limits=cell_meta_unique)
## consensus partation #################
bootcl <-apply(bootbox, 1, function(x) fmatch(x,unique(x)))
consens_par<-cl_consensus(cl_ensemble(list = apply(bootcl[,1:R],2,as.cl_hard_partition)),method = "SM")
class <- max.col(consens_par$.Data)
consens_par<-cl_consensus(cl_ensemble(list = apply(bootcl[,(R+1):(2*R)],2,as.cl_hard_partition)),method = "SM")
class2 <- max.col(consens_par$.Data)
## confidence interval ####
bootci<-apply(boot$t,2, function(x) quantile(x,probs = c(0.025,0.975))) #percentil bootstrap ci
bootci2<-matrix(rep(2*boot$t0,each=2),nrow = 2)-bootci #type 2 percentile
ratio_bc<-boot$t0 #estimator
boot2ci<-apply(boot2$t,2, function(x) quantile(x,probs = c(0.025,0.975))) #percentil bootstrap ci
boot2ci2<-matrix(rep(2*boot2$t0,each=2),nrow = 2)-boot2ci #type 2 percentile
ratio_bc2<-boot2$t0 #estimator
out<-list()
out[["time"]]<-c(t,t2,t3,t4)
out[["ratio"]]<-rbind(ratio_bc,ratio_bc2)
out[["ci"]]<-rbind(bootci2,boot2ci2)
out[["consensus"]]<-rbind(class,class2)
return(out)
}, cl=1)
## [1] "cluster 1"
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## [1] "cluster 2"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 3"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 4"
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 5"
## [1] "cluster 6"
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 7"
## [1] "cluster 8"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 9"
## [1] "cluster 10"
## [1] "cluster 11"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 12"
## [1] "cluster 13"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] "cluster 14"
## [1] "cluster 15"
## [1] "cluster 16"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.256826 hours
stopCluster(cl)
save(ans,file = "/proj/milovelab/mu/SC-ASE/data/deng.rda")